# Notebook setup: install extra packages, import the analysis stack, and
# configure plotting / warnings. The `pip` and `%` lines are IPython commands,
# so this file is meant to run inside a notebook kernel, not plain Python.
pip install -U modin -q
pip install klib -q
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import klib
import seaborn as sns
from matplotlib import pyplot as plt
# Render Matplotlib figures inline in the notebook output.
%matplotlib inline
import missingno as msno
import warnings
# Silence every warning category from this point on.
warnings.filterwarnings('ignore')
from scipy import stats
from scipy.stats import anderson
# The scikit-learn patch is applied before the sklearn imports below;
# keep this order (sklearnex's documentation asks for patching first).
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from timeit import default_timer as timer
%time data=pd.read_csv("/kaggle/input/intel-oneapi-predict-the-quality-of-freshwater/dataset.csv")
import modin.pandas as pd
%time data=pd.read_csv("/kaggle/input/intel-oneapi-predict-the-quality-of-freshwater/dataset.csv")
%time data.info()
data.isnull().sum()
msno.bar(data.drop(columns=['Index','Target']))
msno.heatmap(data.drop(columns=['Index','Target']))
msno.dendrogram(data.drop(columns=['Index','Target']))
data.head()
sns.countplot(x=data['Target'].values)
data['Target'].value_counts()
def box_plot(df, col, rot=None):
    """Draw a vertical box plot of ``df[col]`` on a new 8x6 figure.

    Args:
        df: Frame holding the column to plot.
        col: Column name; also used (capitalized) in the title and as the
            y-axis label.
        rot: Rotation passed to the y-axis label.
    """
    plt.figure(figsize=(8, 6))
    sns.boxplot(y=df[col])
    # Axis cosmetics all act on the current axes created by the box plot.
    plt.yticks(fontsize=14)
    plt.ylabel(col, fontsize=20, rotation=rot)
    plt.title(f"{col.capitalize()} Distribution", fontsize=25)
def side_by_side_plot(df, grp, valcol, rot=None, title=""):
    """Plot ``valcol`` split by ``grp`` as a KDE (left) and box plot (right).

    Args:
        df: Frame holding both columns.
        grp: Name of the grouping column (hue of the KDE, x-axis of the boxes).
        valcol: Name of the numeric column being compared.
        rot: Tick-label rotation applied to the box-plot axes.
        title: Text inserted into both panel titles.
    """
    palette_name = "Paired"
    figure, (density_ax, box_ax) = plt.subplots(1, 2, figsize=(18, 8))
    figure.tight_layout()

    # Both panels share the same title and the same two arrays.
    heading = grp.capitalize() + " Wise " + title + " Distribution"
    measurements = df[valcol].values
    groups = df[grp].values

    # Left panel: one density curve per group.
    sns.kdeplot(x=measurements, hue=groups, ax=density_ax, palette=palette_name)
    density_ax.set_title(heading, size=15)
    density_ax.set_xlabel(valcol, fontsize=20)

    # Right panel: one box per group.
    sns.boxplot(x=groups, y=measurements, ax=box_ax, palette=palette_name)
    box_ax.set_title(heading, size=15)
    box_ax.set_xlabel(grp, fontsize=20)
    box_ax.tick_params(rotation=rot)
# pH on its own: klib distribution plot, box plot, and summary statistics.
klib.dist_plot(data['pH']);
box_plot(data,'pH',rot=90)
data['pH'].describe()
# 'Source' column: category counts (plot and table), then pH compared across sources.
sns.countplot(y=data['Source'].values);
data['Source'].value_counts()
side_by_side_plot(data,'Source','pH',title="Water's Ph Value");
# 'Color' column: pH summary per colour, category counts (bars ordered by
# frequency), then pH compared across colours.
data.groupby('Color')['pH'].describe()
sns.countplot(y=data['Color'].values,order=data['Color'].value_counts().index);
data['Color'].value_counts()
side_by_side_plot(data,'Color','pH',title="Water's Ph Value");
def water_source_color_measure(df, valcol):
    """Show box plots of ``valcol`` by water colour, one panel per water source.

    A 20x20 figure is filled with a 4x2 grid of panels, one for each of the
    eight hard-coded source names. Each panel holds the rows of ``df`` whose
    'Source' equals that name, with 'Color' on the x-axis and ``valcol`` on
    the y-axis. The figure is shown before returning.

    Args:
        df: Frame with 'Source' and 'Color' columns plus ``valcol``.
        valcol: Name of the numeric column to plot.
    """
    sources = ['Lake', 'River', 'Ground', 'Spring', 'Stream', 'Aquifer',
               'Reservoir', 'Well']

    # Create a bare figure. plt.subplots() would also add one full-size Axes,
    # which recent Matplotlib versions leave in place underneath the grid.
    plt.figure(figsize=(20, 20))

    for position, source in enumerate(sources, start=1):
        # Filter the rows for this source once and reuse them for both axes.
        subset = df[df['Source'] == source]
        plt.subplot(4, 2, position)
        sns.boxplot(x=subset['Color'].values, y=subset[valcol].values)
        plt.title(f"{source} Water's {valcol} Value Distribution by Different Color", fontsize=15)
        plt.xticks(fontsize=15)

    # One layout pass after every panel exists is enough.
    plt.tight_layout()
    plt.show()
# Per-feature exploration. Every numeric feature gets the same sequence:
# klib distribution plot, box plot, summary statistics, per-source box plots
# by colour, density split by 'Target', and summary statistics per 'Target'.

# pH (continued; its first three steps appear earlier in the file).
water_source_color_measure(data,'pH');
sns.kdeplot(x=data['pH'],hue=data['Target']);
data.groupby('Target')['pH'].describe()
# Iron
klib.dist_plot(data['Iron']);
box_plot(data,'Iron',rot=90);
data['Iron'].describe()
water_source_color_measure(data,'Iron');
sns.kdeplot(x=data['Iron'],hue=data['Target']);
data.groupby('Target')['Iron'].describe()
# Nitrate
klib.dist_plot(data['Nitrate']);
box_plot(data,'Nitrate',rot=90);
data['Nitrate'].describe()
water_source_color_measure(data,'Nitrate');
sns.kdeplot(x=data['Nitrate'],hue=data['Target']);
data.groupby('Target')['Nitrate'].describe()
# Chloride
klib.dist_plot(data['Chloride']);
box_plot(data,'Chloride',rot=90);
data['Chloride'].describe()
water_source_color_measure(data,'Chloride');
sns.kdeplot(x=data['Chloride'],hue=data['Target']);
data.groupby('Target')['Chloride'].describe()
# Lead
klib.dist_plot(data['Lead']);
box_plot(data,'Lead',rot=90);
data['Lead'].describe()
water_source_color_measure(data,'Lead');
sns.kdeplot(x=data['Lead'],hue=data['Target']);
data.groupby('Target')['Lead'].describe()
# Zinc
klib.dist_plot(data['Zinc']);
box_plot(data,'Zinc',rot=90);
data['Zinc'].describe()
water_source_color_measure(data,'Zinc');
sns.kdeplot(x=data['Zinc'],hue=data['Target']);
data.groupby('Target')['Zinc'].describe()
# Turbidity
klib.dist_plot(data['Turbidity']);
box_plot(data,'Turbidity',rot=90);
data['Turbidity'].describe()
water_source_color_measure(data,'Turbidity');
sns.kdeplot(x=data['Turbidity'],hue=data['Target']);
data.groupby('Target')['Turbidity'].describe()
# Fluoride
klib.dist_plot(data['Fluoride']);
box_plot(data,'Fluoride',rot=90);
data['Fluoride'].describe()
water_source_color_measure(data,'Fluoride');
sns.kdeplot(x=data['Fluoride'],hue=data['Target']);
data.groupby('Target')['Fluoride'].describe()
# Copper
klib.dist_plot(data['Copper']);
box_plot(data,'Copper',rot=90);
data['Copper'].describe()
water_source_color_measure(data,'Copper');
sns.kdeplot(x=data['Copper'],hue=data['Target']);
data.groupby('Target')['Copper'].describe()
# Odor
klib.dist_plot(data['Odor']);
box_plot(data,'Odor',rot=90);
data['Odor'].describe()
water_source_color_measure(data,'Odor');
sns.kdeplot(x=data['Odor'],hue=data['Target']);
data.groupby('Target')['Odor'].describe()
# Sulfate
klib.dist_plot(data['Sulfate']);
box_plot(data,'Sulfate',rot=90);
data['Sulfate'].describe()
water_source_color_measure(data,'Sulfate');
sns.kdeplot(x=data['Sulfate'],hue=data['Target']);
data.groupby('Target')['Sulfate'].describe()
# Conductivity
klib.dist_plot(data['Conductivity']);
box_plot(data,'Conductivity',rot=90);
data['Conductivity'].describe()
water_source_color_measure(data,'Conductivity');
sns.kdeplot(x=data['Conductivity'],hue=data['Target']);
data.groupby('Target')['Conductivity'].describe()
# Chlorine
klib.dist_plot(data['Chlorine']);
box_plot(data,'Chlorine',rot=90);
data['Chlorine'].describe()
water_source_color_measure(data,'Chlorine');
sns.kdeplot(x=data['Chlorine'],hue=data['Target']);
data.groupby('Target')['Chlorine'].describe()
# Manganese
klib.dist_plot(data['Manganese']);
box_plot(data,'Manganese',rot=90);
data['Manganese'].describe()
water_source_color_measure(data,'Manganese');
sns.kdeplot(x=data['Manganese'],hue=data['Target']);
data.groupby('Target')['Manganese'].describe()
# Total Dissolved Solids
klib.dist_plot(data['Total Dissolved Solids']);
box_plot(data,'Total Dissolved Solids',rot=90);
data['Total Dissolved Solids'].describe()
water_source_color_measure(data,'Total Dissolved Solids');
sns.kdeplot(x=data['Total Dissolved Solids'],hue=data['Target']);
data.groupby('Target')['Total Dissolved Solids'].describe()
# Water Temperature
klib.dist_plot(data['Water Temperature']);
box_plot(data,'Water Temperature',rot=90);
data['Water Temperature'].describe()
water_source_color_measure(data,'Water Temperature');
sns.kdeplot(x=data['Water Temperature'],hue=data['Target']);
data.groupby('Target')['Water Temperature'].describe()
# Air Temperature
klib.dist_plot(data['Air Temperature']);
box_plot(data,'Air Temperature',rot=90);
data['Air Temperature'].describe()
water_source_color_measure(data,'Air Temperature');
sns.kdeplot(x=data['Air Temperature'],hue=data['Target']);
data.groupby('Target')['Air Temperature'].describe()
def w_test(df, targetcol, numcol):
    """Compare ``numcol`` between the two target classes with a rank-sum test.

    Rows where ``targetcol == 1`` form the first sample and rows where
    ``targetcol == 0`` form the second. Rows with a missing value in either
    of the two columns are dropped from each sample before testing.

    Args:
        df: Frame holding both columns.
        targetcol: Name of the binary (0/1) class column.
        numcol: Name of the numeric column to test.

    Returns:
        A two-element list ``[statistic, p_value]`` taken from
        ``scipy.stats.ranksums``.
    """
    def sample_for(label):
        # Restrict to the two relevant columns so dropna() ignores gaps elsewhere.
        subset = df[df[targetcol] == label][[numcol, targetcol]]
        return subset.dropna()[numcol]

    positives = sample_for(1)
    negatives = sample_for(0)
    outcome = stats.ranksums(positives, negatives)
    return [outcome[0], outcome[1]]
# Rank-sum test of every numeric feature against the binary 'Target' column,
# collected into one summary table.
col_list=['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc',
          'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity',
          'Chlorine', 'Manganese', 'Total Dissolved Solids',
          'Water Temperature', 'Air Temperature']
# w_test returns a [statistic, p_value] pair; results stay in col_list order.
results = [w_test(data, "Target", col) for col in col_list]
t_stat = [res[0] for res in results]
p_val = [res[1] for res in results]
# NOTE(review): the 't_tstat' key looks like a typo for 't_stat'; it is left
# unchanged in case code outside this section reads the column by that name.
stat_test=pd.DataFrame({'column_name':col_list,'t_tstat':t_stat,'p_value':p_val})
# Label each feature using a 0.05 p-value threshold.
stat_test['result']=stat_test['p_value'].apply(lambda x:"significant" if x<0.05 else "not_significant" )
stat_test